#! /usr/bin/perl

#*****************************************************************************
# IrstLM: IRST Language Model Toolkit
# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy

# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.

# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA

#******************************************************************************
#merge prefix LMs into one single file

use strict;
use Getopt::Long "GetOptions";
use File::Basename;

my ($help,$lm,$size,$sublm,$backoff)=();
$help=0;
$backoff=0;

&GetOptions('size=i' => \$size,
'lm=s' => \$lm,
'sublm=s' => \$sublm,
'backoff' => \$backoff,
'h|help' => \$help);

if ($help || !$size || !$lm || !$sublm) {
	my $cmnd = basename($0);
  print "\n$cmnd - merge single LMs\n",
	"\nUSAGE:\n",
	"       $cmnd [options]\n",
	"\nOPTIONS:\n",
	"       --size <int>          maximum n-gram size for the language model\n",
	"       --sublm <string>      path identifying all input prefix sub LMs\n",
	"       --lm <string>         name of the output LM file (will be gzipped)\n",
	"       --backoff						  (optional) create a backoff LM, output is directly in ARPA format (default is false, i.e. iARPA format) \n",
	"       -h, --help            (optional) print these instructions\n",
	"\n";

  exit(1);
}


my $gzip=`which gzip 2> /dev/null`;
my $gunzip=`which gunzip 2> /dev/null`;
chomp($gzip);
chomp($gunzip);

warn "merge-sublm.pl --size $size --sublm $sublm --lm $lm --backoff $backoff\n";

warn "Compute total sizes of n-grams\n";
my @size=();          #number of n-grams for each level
my $tot1gr=0;         #total frequency of 1-grams
my $unk=0;            #frequency of <unk>
my $pr;               #probability of 1-grams
my (@files,$files);   #sublm files for a given n-gram size  

for (my $n=1;$n<=$size;$n++){

  @files=map { glob($_) } "${sublm}*.${n}gr*";
  $files=join(" ",@files);
  $files || die "cannot find sublm files\n";
  warn "join files $files\n";
  
  if ($n==1){
    open(INP,"$gunzip -c $files|") || die "cannot open $files\n";
    while(my $line = <INP>){
      $size[$n]++;
      chomp($line);
      warn "there is an empty line in any of these files ($files); this should not happen\n" if $line =~ /^$/;
      my @words = split(/[ \t]+/,$line);
      #cut down counts for sentence initial
      $words[0]=1 if $words[1]=~/<s>/;
      #there could be more independent <unk> words
      #generated by ngt with -sd option
      $size[$n]-- if $unk && $words[1] eq "<unk>";
      $unk+=$words[0] if $words[1]=~/<unk>/i;
      $tot1gr+=$words[0];
    }
    close(INP);
    if ($unk==0){
      warn "implicitely add <unk> word to counters\n";
      $tot1gr+=$size[$n]; #equivalent to WB smoothing
      $size[$n]++; 
    }
  }else{
    for (my $j=0;$j<scalar(@files);$j++){
      safesystem("$gunzip -c $files[$j] | grep -v '10000.000' | wc -l > wc$$") or die;
      open(INP,"wc$$") || die "cannot open wc$$\n";
      my $wc = <INP>;
      chomp($wc);
      $size[$n] += $wc;
      close(INP);
      unlink("wc$$");
    }
  }
  warn "n:$n size:$size[$n] unk:$unk\n";
}

warn "Merge all sub LMs\n";

$lm.=".gz" if $lm!~/.gz$/;
open(LM,"|$gzip -c > $lm") || die "Cannot open $lm\n";

warn "Write LM Header\n";
if ($backoff){
	printf LM "ARPA\n\n";
} else{
	printf LM "iARPA\n\n";
}

printf LM "\\data\\\n";
for (my $n=1;$n<=$size;$n++){
    printf LM "ngram $n=\t$size[$n]\n";
}
printf LM "\n";
close(LM);

warn "Writing LM Tables\n";
for (my $n=1;$n<=$size;$n++){
  
  warn "Level $n\n";
  
  @files=map { glob($_) } "${sublm}*.${n}gr*";
  $files=join(" ",@files);
  warn "input from: $files\n";
  if ($n==1){         
    open(INP,"$gunzip -c $files|") || die "cannot open $files\n";
    open(LM,"|$gzip -c >> $lm");
    printf LM "\\$n-grams:\n";
    while(my $line = <INP>){   
      chomp($line);
      warn "there is an empty line in any of these files ($files); this should not happen\n" if $line =~ /^$/;
	 #lowercase some expressions of google n-grams
      $line=~s/<S>/<s>/g;
      $line=~s/<\/S>/<\/s>/g;
      $line=~s/<UNK>/<unk>/g;

      my @words = split(/[ \t]+/,$line);

      #always print unk a the eqnd
      next if $words[1]=~/<unk>/i;

      #cut down counts for sentence initial
      $words[0]=1 if $words[1]=~/<s>/i;	  
	  	
      #apply witten-bell smoothing on 1-grams
      $pr=(log($words[0]+1)-log($tot1gr+$size[1]))/log(10.0);
      shift @words;
      printf LM "%f\t%s\t%f\n",$pr,$words[0],$words[1];
    }
    close(INP);

    #print final <unk>
    #witten-bell smoothing of <unk> probability
    if ($unk){
      $pr=(log($unk+1)-log($tot1gr+$size[1]))/log(10.0);
    }else{
      $pr=(log($size[1]-1+1)-log($tot1gr+$size[1]))/log(10.0);
    }

    printf LM "%f <unk>\n",$pr;
    close(LM);
  }else{
    open(LM,"|$gzip -c >> $lm");
    printf LM "\\$n-grams:\n";
    close(LM);
    for (my $j=0;$j<scalar(@files);$j++){
      safesystem("$gunzip -c $files[$j] | grep -v '10000.000' | gzip -c >> $lm") or die;
    }
  }

}

open(LM,"|$gzip -c >> $lm") || die "Cannot open $lm\n";
printf LM "\\end\\\n";
close(LM);

sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
      print STDERR "Failed to execute: @_\n  $!\n";
      exit(1);
  }
  elsif ($? & 127) {
      printf STDERR "Execution of: @_\n  died with signal %d, %s coredump\n",
          ($? & 127),  ($? & 128) ? 'with' : 'without';
      exit(1);
  }
  else {
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}

